# -*- coding: utf-8 -*-
import time
import scrapy
import time
import re, json
from jsonpath import jsonpath

class LagouSpider(scrapy.Spider):
    """Scrape Lagou job listings through the positionAjax JSON endpoint.

    The listing page in ``start_urls`` is only the session entry point;
    the actual data is obtained by POSTing increasing page numbers to the
    AJAX search endpoint (``parse`` kicks off page 0, ``parse2`` pages on).
    """
    name = 'lagou'
    allowed_domains = ['lagou.com']
    start_urls = ['https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=']

    # Throttle between requests via Scrapy's scheduler. The original code
    # called time.sleep(2) inside a callback, which blocks the Twisted
    # reactor and stalls every in-flight request of the whole crawler.
    custom_settings = {'DOWNLOAD_DELAY': 2}

    # Referer + session cookie required to pass Lagou's anti-crawler check.
    # NOTE(review): the hard-coded Cookie/session values expire; refresh them
    # (or rely on cookie middleware) before running, otherwise the endpoint
    # returns an anti-crawler message instead of job data.
    headers = dict(Referer="https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=", Cookie='user_trace_token=20190128163744-fbc0d5cb-22d7-11e9-b822-5254005c3644; LGUID=20190128163744-fbc0da67-22d7-11e9-b822-5254005c3644; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221693804d98b31b-09037b0c68253f-3e70055f-1049088-1693804d98c6b1%22%2C%22%24device_id%22%3A%221693804d98b31b-09037b0c68253f-3e70055f-1049088-1693804d98c6b1%22%7D; index_location_city=%E5%85%A8%E5%9B%BD; _ga=GA1.2.2068343149.1548664664; JSESSIONID=ABAAABAABEEAAJA7F47D6D6426DD1A05BFF068478B8C2FA; _gat=1; _gid=GA1.2.705604554.1554528396; LGSID=20190406132633-8a7b31df-582c-11e9-bd7b-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_java%3Fcity%3D%25E5%2585%25A8%25E5%259B%25BD%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1554364643,1554528396; SEARCH_ID=93f97f1f2827479da177051356d223c5; X_HTTP_TOKEN=af7940ed6d63d0870148254551eea683fb7824e6f9; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1554528413; LGRID=20190406132651-94d06ef0-582c-11e9-bd7b-5254005c3644')

    def parse(self, response):
        """Request page 0 of the AJAX search results.

        NOTE(review): the form searches ``kd='java'`` while ``start_urls``
        points at a python search — confirm which keyword is intended.
        """
        form_data = dict(first='false', pn='0', kd='java')
        yield scrapy.FormRequest(
            'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false',
            formdata=form_data,
            callback=self.parse2,
            # Carry the form data along so parse2 can build the next page's request.
            meta={'a': form_data},
            headers=self.headers,
        )

    def parse2(self, response):
        """Yield each job record from one JSON page, then queue the next page.

        Stops once the page counter reaches 30 (the site serves at most
        30 result pages).
        """
        # Copy rather than mutate the dict shared through meta, so every
        # in-flight request keeps its own independent form data.
        a = dict(response.meta['a'])
        a['pn'] = str(int(a['pn']) + 1)  # advance to the next page number
        print('当前url的请求表单', a)
        json_d = json.loads(response.text)
        content_d = json_d["content"]
        # List of result entries; each entry is a dict describing one job posting.
        info = content_d["positionResult"]["result"]
        for item in info:
            print(item)
            # Fixed: the yield was commented out, so items were only printed
            # and never reached item pipelines.
            yield item
        print('当前页面提取成功')
        if a['pn'] == '30':  # the site only exposes 30 pages
            return
        yield scrapy.FormRequest(
            'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false',
            formdata=a,
            callback=self.parse2,
            meta={'a': a},
            headers=self.headers,
        )
